library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(htmltools)
library(devtools)
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.4
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.5
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(NbClust)
# Read in Data
nba <-  read_csv("~/Fall21/introDS/DS-3001-New/data/nba2020-21.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   Player = col_character(),
##   Pos = col_character(),
##   Tm = col_character()
## )
## i Use `spec()` for the full column specifications.
nba_salaries <- read_csv("~/Fall21/introDS/DS-3001-New/data/nba_salaries_21.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Player = col_character(),
##   `2020-21` = col_double()
## )
nba_final = inner_join(nba, nba_salaries)
## Joining, by = "Player"
#View(nba_final)

#Remove NAs
nba_final <- na.omit(nba_final)

#rename salary varaible
nba_final <- rename(nba_final, Salary = `2020-21`)
#Remove special characters
nba_final$Player <- gsub("[^[:alnum:]]", "", nba_final$Player)

#Cast Pos as factor
nba_final$Pos <- fct_collapse(nba_final$Pos, 
                              C = "C",
                              PF = "PF",
                              PG = c("PG", "PG-SG"),
                              SF = c("SF", "SF-PF"),
                              SG = "SG")

The variables used to cluster the data are GS (games started), MP (minutes played), FG% (field goal percentage), 3P% (three point percentage), 2P% (two point percentage), eFG% (Effective Field Goal Percentage; the formula is (FG + 0.5 * 3P) / FGA), FT% (Free throw percentage), and PTS (points). All the percentages were chosen since they give the relative success out of the total attempts of the statistic rather than just the quantity of attempts or success. Games started, minutes played, and points give career long statistics that demonstrate long term success. Since these variables are on different scales they were normalized on a scale of 0 to 1.

#Select the variables to be included in the cluster 
clust_data_nba <- nba_final[, c("GS", "MP", "FG%", "3P%", "2P%", "eFG%", "FT%", "PTS")]

#Normalization Function 
normalize <- function(x){
 (x - min(x)) / (max(x) - min(x))
}
#select numeric variable names
abc <- names(select_if(clust_data_nba, is.numeric))
#Apply normalizing
clust_data_nba[abc] <- as_tibble(lapply(clust_data_nba[abc], normalize))
#Run the clustering algo with 2 centers
set.seed(1)
kmeans_obj_nba = kmeans(clust_data_nba, centers = 2, 
                        algorithm = "Lloyd")
#View the results
kmeans_obj_nba
## K-means clustering with 2 clusters of sizes 155, 263
## 
## Cluster means:
##           GS        MP       FG%       3P%       2P%      eFG%       FT%
## 1 0.75815170 0.6969126 0.5316692 0.3471419 0.6898684 0.6273779 0.7845161
## 2 0.09906484 0.2881184 0.4906233 0.3192243 0.6632031 0.5856166 0.7362852
##         PTS
## 1 0.4652350
## 2 0.1486547
## 
## Clustering vector:
##   [1] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 1 2 1 2 2 1 2 1 1 1 2 2 2 1 1 2
##  [38] 1 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 2
##  [75] 1 2 1 2 1 2 2 2 2 1 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [112] 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 2 2 2 1 2 1 2 2 1 2 2 1 2
## [149] 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 2
## [186] 2 2 1 2 2 2 2 1 2 1 1 2 1 2 2 2 1 1 2 1 1 2 1 1 2 2 2 2 2 2 2 1 1 2 1 1 2
## [223] 2 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 2 2 2 2 1 1
## [260] 1 2 1 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## [297] 1 2 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 1 1
## [334] 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 1
## [408] 1 1 2 2 1 2 2 2 1 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 21.20299 46.29813
##  (between_SS / total_SS =  50.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

The largest difference in means were the variables games started (GS), Minutes played (MP), and Points scored (PTS). I chose to visualize the clusters with Points scored on the x axis and minutes played on the y axis in 2 dimensions and then added games started on the z axis for the 3 dimensional graph.

#Visualize the output
salary_clusters_nba = as.factor(kmeans_obj_nba$cluster)
ggplot(nba_final, aes(x = PTS, 
                      y = MP,
                      color = Salary,
                      shape = salary_clusters_nba)) + 
  geom_point(size = 6) +
  ggtitle("Minutes Played vs Pts Scored for nba players") +
  xlab("Points Scored (PTS)") +
  ylab("Minutes Played (MP)") +
  scale_shape_manual(name = "Cluster",
                     labels = c("Cluster 1", "Cluster 2"),
                     values = c("1", "2")) +
  theme_light()

There is a clear correlation with the clusters and salary seen in the graph. Cluster 1 is the under performing cluster, and the salary tends to be a darker color therefore they are paid less. Cluster 2 is the better performing cluster, and they have a lighter color overall therefore they are paid more. We will be looking to find players in cluster 2 that are paid less to bring to the team.

#save as a png
ggsave("NbaSalaryClusters.png", 
       width = 10, 
       height = 5.62, 
       units = "in")

The explained variance is about 50%, which could definitely be improved, but overall it is significant enough to say that there are two distinct clusters.

#Evaluate the quality of the clustering 

# Inter-cluster variance,
# "betweenss" is the sum of the distances between points 
# from different clusters.
num_nba = kmeans_obj_nba$betweenss

# Total variance, "totss" is the sum of the distances
# between all the points in the data set.
denom_nba = kmeans_obj_nba$totss

# Variance accounted for by clusters.
(var_exp_nba = num_nba / denom_nba)
## [1] 0.5060049
#Use the function we created to evaluate several different number of clusters

# The function explained_variance wraps our code for calculating 
# the variance explained by clustering.
explained_variance = function(data_in, k){
  
  # Running the kmeans algorithm.
  set.seed(1)
  kmeans_obj = kmeans(data_in, centers = k, algorithm = "Lloyd", iter.max = 30)
  
  # Variance accounted for by clusters:
  # var_exp = intercluster variance / total variance
  var_exp = kmeans_obj$betweenss / kmeans_obj$totss
  var_exp  
}
explained_var_nba = sapply(1:10, explained_variance, data_in = clust_data_nba)
explained_var_nba
##  [1] 1.247996e-15 5.060049e-01 5.948167e-01 6.441040e-01 6.930132e-01
##  [6] 7.153475e-01 7.390406e-01 7.460434e-01 7.579561e-01 7.793619e-01
#Create a elbow chart of the output 

# Data for ggplot2.
elbow_data_nba = data.frame(k = 1:10, explained_var_nba)


# Plotting data.
ggplot(elbow_data_nba, 
       aes(x = k,  
           y = explained_var_nba)) + 
  geom_point(size = 4) +           #<- sets the size of the data points
  geom_line(size = 1) +            #<- sets the thickness of the line
  xlab('k') + 
  ylab('Inter-cluster Variance / Total Variance') + 
  theme_light()

#Use NbClust to select a number of clusters
# Run NbClust.

(nbclust_obj_nba = NbClust(data = clust_data_nba, method = "kmeans"))

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 2 proposed 13 as the best number of clusters 
## * 2 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
## $All.index
##         KL       CH Hartigan     CCC    Scott   Marriot  TrCovW  TraceW
## 2   7.6120 426.1332  91.1709 36.5429 2484.287 1309559.9 78.2423 67.4996
## 3   1.7281 304.6140  58.8137 33.4956 2844.895 1243489.8 61.3276 55.3656
## 4   0.8079 250.8545  60.4507 29.4360 3160.015 1040196.4 55.5908 48.4931
## 5   1.9489 230.1679  38.6908 28.7133 3451.409  809437.0 33.6598 42.3145
## 6   0.8935 208.6162  38.8708 28.5323 3607.021  803280.4 28.9699 38.6900
## 7   3.2326 196.2497  21.0319 29.3469 3949.433  481948.8 24.4999 35.3544
## 8   1.1279 179.3890  18.5829 29.0077 4064.865  477586.8 21.6039 33.6333
## 9   0.3882 165.9967  27.7850 28.7124 4205.075  432197.4 18.9741 32.1750
## 10  0.9977 160.2708  27.3682 29.3560 4388.476  344069.9 17.2411 30.1283
## 11  7.2157 156.2723  11.3830 30.1154 4540.821  289166.9 13.5681 28.2343
## 12  0.1427 146.7125  25.9116 29.6889 4657.821  260114.7 13.2729 27.4662
## 13  7.4351 144.8712   5.2150 30.5591 4765.235  236095.4 12.1167 25.8184
## 14  0.1646 135.5149  21.8158 29.8075 4850.068  223520.5 12.1366 25.4902
## 15 10.1954 133.8564   5.1149 30.5111 4959.189  197637.1 10.7458 24.1842
##    Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2  416.1923 15.2130 0.3193 0.9356     0.4323 1.5255 -93.3563 -1.8088    0.2688
## 3  433.0483 18.5471 0.2860 1.4579     0.2723 1.5286 -80.9202 -1.8152    0.2741
## 4  444.8164 21.1756 0.3137 1.5362     0.2296 2.1261 -88.9810 -2.7689    0.2838
## 5  466.5084 24.2676 0.3080 1.4217     0.2362 1.4919 -43.8538 -1.7239    0.2899
## 6  468.2087 26.5410 0.2889 1.3898     0.2239 1.1471 -14.8763 -0.6694    0.2907
## 7  489.5378 29.0451 0.2759 1.4840     0.2160 0.8771  10.9308  0.7302    0.2708
## 8  497.6069 30.5314 0.2696 1.5375     0.2109 1.2951 -11.8478 -1.1698    0.2603
## 9  508.6330 31.9152 0.2658 1.4611     0.2120 2.6058 -43.1370 -3.1794    0.2525
## 10 526.9319 34.0833 0.2542 1.4656     0.2079 1.2468 -20.1878 -1.0334    0.2486
## 11 540.2173 36.3696 0.2679 1.4621     0.2047 1.5051 -23.1564 -1.7270    0.2400
## 12 551.7180 37.3868 0.2666 1.4754     0.1877 1.7589 -22.0038 -2.2187    0.2309
## 13 557.7145 39.7729 0.3361 1.4433     0.1920 1.1157  -4.7691 -0.5339    0.2273
## 14 569.2706 40.2850 0.3365 1.5515     0.1779 1.5258 -17.5748 -1.7707    0.2193
## 15 577.1240 42.4604 0.3260 1.4295     0.1879 1.2888  -8.9629 -1.1484    0.2127
##       Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  33.7498     0.6378  1.8066  0.4853 0.0775 0.0130  6.0473 0.3721 0.9029
## 3  18.4552     0.5393  0.9801  1.1106 0.0458 0.0149  8.4456 0.3378 0.6370
## 4  12.1233     0.5039  0.5516  1.5202 0.0403 0.0167  8.5139 0.3137 0.4711
## 5   8.4629     0.4836  0.2028  1.9308 0.0805 0.0178  7.4641 0.2939 0.4236
## 6   6.4483     0.4871  0.8099  2.1071 0.0519 0.0190  8.3471 0.2815 0.2744
## 7   5.0506     0.4515  0.7664  2.6397 0.0658 0.0197  8.9887 0.2668 0.2389
## 8   4.2042     0.4327  0.3578  2.9691 0.0658 0.0197 10.0323 0.2604 0.2246
## 9   3.5750     0.4286  0.3255  3.0767 0.0675 0.0201  9.9665 0.2553 0.2137
## 10  3.0128     0.4169  0.5458  3.4006 0.0675 0.0204  9.8946 0.2482 0.2090
## 11  2.5668     0.3954  1.6608  3.9095 0.0736 0.0207  9.6148 0.2418 0.1927
## 12  2.2888     0.3840  0.0806  4.1720 0.0860 0.0212 10.0510 0.2380 0.1796
## 13  1.9860     0.3850 -6.1223  4.2384 0.0956 0.0216 10.1628 0.2324 0.1795
## 14  1.8207     0.3738  0.2149  4.5073 0.0956 0.0217 13.3774 0.2308 0.1730
## 15  1.6123     0.3674 -2.1568  4.7573 0.0761 0.0220 10.6647 0.2252 0.1691
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.7982            68.5152       1.0000
## 3          0.7953            60.2209       1.0000
## 4          0.7631            52.1431       1.0000
## 5          0.7640            41.0886       1.0000
## 6          0.7529            38.0664       1.0000
## 7          0.7417            27.1683       0.6649
## 8          0.6676            25.8879       1.0000
## 9          0.6891            31.5794       1.0000
## 10         0.7549            33.1122       1.0000
## 11         0.6775            32.8385       1.0000
## 12         0.6744            24.6263       1.0000
## 13         0.6806            21.5875       1.0000
## 14         0.6711            24.9985       1.0000
## 15         0.6603            20.5788       1.0000
## 
## $Best.nc
##                      KL       CH Hartigan     CCC    Scott  Marriot  TrCovW
## Number_clusters 15.0000   2.0000   3.0000  2.0000   3.0000      7.0  5.0000
## Value_Index     10.1954 426.1332  32.3573 36.5429 360.6084 316969.6 21.9309
##                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
## Number_clusters 3.0000    5.000 13.0000 10.0000 2.0000     2.0000 2.0000
## Value_Index     5.2615   21.692 -1.8739  0.2542 0.9356     0.4323 1.5255
##                 PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
## Number_clusters   2.0000  2.0000    6.0000  3.0000     2.0000 2.0000  2.0000
## Value_Index     -93.3563 -1.8088    0.2907 15.2946     0.6378 1.8066  0.4853
##                    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 13.0000      0  2.0000      0 15.0000
## Value_Index      0.0956      0  6.0473      0  0.1691
## 
## $Best.partition
##   [1] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 1 2 1 2 2 1 2 1 1 1 2 2 2 1 1 2
##  [38] 1 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 2
##  [75] 1 2 1 2 1 2 2 2 2 1 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [112] 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 1 2
## [149] 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 2
## [186] 2 2 1 2 2 2 2 1 2 1 1 2 1 2 2 2 1 1 2 1 1 2 1 1 2 2 2 2 2 2 2 1 1 2 1 1 2
## [223] 2 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 2 2 2 2 1 1
## [260] 1 2 1 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## [297] 1 2 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 1 1
## [334] 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 1
## [408] 1 1 2 2 1 2 2 2 1 2 1
# View the output of NbClust.
nbclust_obj_nba
## $All.index
##         KL       CH Hartigan     CCC    Scott   Marriot  TrCovW  TraceW
## 2   7.6120 426.1332  91.1709 36.5429 2484.287 1309559.9 78.2423 67.4996
## 3   1.7281 304.6140  58.8137 33.4956 2844.895 1243489.8 61.3276 55.3656
## 4   0.8079 250.8545  60.4507 29.4360 3160.015 1040196.4 55.5908 48.4931
## 5   1.9489 230.1679  38.6908 28.7133 3451.409  809437.0 33.6598 42.3145
## 6   0.8935 208.6162  38.8708 28.5323 3607.021  803280.4 28.9699 38.6900
## 7   3.2326 196.2497  21.0319 29.3469 3949.433  481948.8 24.4999 35.3544
## 8   1.1279 179.3890  18.5829 29.0077 4064.865  477586.8 21.6039 33.6333
## 9   0.3882 165.9967  27.7850 28.7124 4205.075  432197.4 18.9741 32.1750
## 10  0.9977 160.2708  27.3682 29.3560 4388.476  344069.9 17.2411 30.1283
## 11  7.2157 156.2723  11.3830 30.1154 4540.821  289166.9 13.5681 28.2343
## 12  0.1427 146.7125  25.9116 29.6889 4657.821  260114.7 13.2729 27.4662
## 13  7.4351 144.8712   5.2150 30.5591 4765.235  236095.4 12.1167 25.8184
## 14  0.1646 135.5149  21.8158 29.8075 4850.068  223520.5 12.1366 25.4902
## 15 10.1954 133.8564   5.1149 30.5111 4959.189  197637.1 10.7458 24.1842
##    Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2  416.1923 15.2130 0.3193 0.9356     0.4323 1.5255 -93.3563 -1.8088    0.2688
## 3  433.0483 18.5471 0.2860 1.4579     0.2723 1.5286 -80.9202 -1.8152    0.2741
## 4  444.8164 21.1756 0.3137 1.5362     0.2296 2.1261 -88.9810 -2.7689    0.2838
## 5  466.5084 24.2676 0.3080 1.4217     0.2362 1.4919 -43.8538 -1.7239    0.2899
## 6  468.2087 26.5410 0.2889 1.3898     0.2239 1.1471 -14.8763 -0.6694    0.2907
## 7  489.5378 29.0451 0.2759 1.4840     0.2160 0.8771  10.9308  0.7302    0.2708
## 8  497.6069 30.5314 0.2696 1.5375     0.2109 1.2951 -11.8478 -1.1698    0.2603
## 9  508.6330 31.9152 0.2658 1.4611     0.2120 2.6058 -43.1370 -3.1794    0.2525
## 10 526.9319 34.0833 0.2542 1.4656     0.2079 1.2468 -20.1878 -1.0334    0.2486
## 11 540.2173 36.3696 0.2679 1.4621     0.2047 1.5051 -23.1564 -1.7270    0.2400
## 12 551.7180 37.3868 0.2666 1.4754     0.1877 1.7589 -22.0038 -2.2187    0.2309
## 13 557.7145 39.7729 0.3361 1.4433     0.1920 1.1157  -4.7691 -0.5339    0.2273
## 14 569.2706 40.2850 0.3365 1.5515     0.1779 1.5258 -17.5748 -1.7707    0.2193
## 15 577.1240 42.4604 0.3260 1.4295     0.1879 1.2888  -8.9629 -1.1484    0.2127
##       Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  33.7498     0.6378  1.8066  0.4853 0.0775 0.0130  6.0473 0.3721 0.9029
## 3  18.4552     0.5393  0.9801  1.1106 0.0458 0.0149  8.4456 0.3378 0.6370
## 4  12.1233     0.5039  0.5516  1.5202 0.0403 0.0167  8.5139 0.3137 0.4711
## 5   8.4629     0.4836  0.2028  1.9308 0.0805 0.0178  7.4641 0.2939 0.4236
## 6   6.4483     0.4871  0.8099  2.1071 0.0519 0.0190  8.3471 0.2815 0.2744
## 7   5.0506     0.4515  0.7664  2.6397 0.0658 0.0197  8.9887 0.2668 0.2389
## 8   4.2042     0.4327  0.3578  2.9691 0.0658 0.0197 10.0323 0.2604 0.2246
## 9   3.5750     0.4286  0.3255  3.0767 0.0675 0.0201  9.9665 0.2553 0.2137
## 10  3.0128     0.4169  0.5458  3.4006 0.0675 0.0204  9.8946 0.2482 0.2090
## 11  2.5668     0.3954  1.6608  3.9095 0.0736 0.0207  9.6148 0.2418 0.1927
## 12  2.2888     0.3840  0.0806  4.1720 0.0860 0.0212 10.0510 0.2380 0.1796
## 13  1.9860     0.3850 -6.1223  4.2384 0.0956 0.0216 10.1628 0.2324 0.1795
## 14  1.8207     0.3738  0.2149  4.5073 0.0956 0.0217 13.3774 0.2308 0.1730
## 15  1.6123     0.3674 -2.1568  4.7573 0.0761 0.0220 10.6647 0.2252 0.1691
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.7982            68.5152       1.0000
## 3          0.7953            60.2209       1.0000
## 4          0.7631            52.1431       1.0000
## 5          0.7640            41.0886       1.0000
## 6          0.7529            38.0664       1.0000
## 7          0.7417            27.1683       0.6649
## 8          0.6676            25.8879       1.0000
## 9          0.6891            31.5794       1.0000
## 10         0.7549            33.1122       1.0000
## 11         0.6775            32.8385       1.0000
## 12         0.6744            24.6263       1.0000
## 13         0.6806            21.5875       1.0000
## 14         0.6711            24.9985       1.0000
## 15         0.6603            20.5788       1.0000
## 
## $Best.nc
##                      KL       CH Hartigan     CCC    Scott  Marriot  TrCovW
## Number_clusters 15.0000   2.0000   3.0000  2.0000   3.0000      7.0  5.0000
## Value_Index     10.1954 426.1332  32.3573 36.5429 360.6084 316969.6 21.9309
##                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
## Number_clusters 3.0000    5.000 13.0000 10.0000 2.0000     2.0000 2.0000
## Value_Index     5.2615   21.692 -1.8739  0.2542 0.9356     0.4323 1.5255
##                 PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
## Number_clusters   2.0000  2.0000    6.0000  3.0000     2.0000 2.0000  2.0000
## Value_Index     -93.3563 -1.8088    0.2907 15.2946     0.6378 1.8066  0.4853
##                    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 13.0000      0  2.0000      0 15.0000
## Value_Index      0.0956      0  6.0473      0  0.1691
## 
## $Best.partition
##   [1] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 1 1 2 1 2 2 1 2 1 1 1 2 2 2 1 1 2
##  [38] 1 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 2
##  [75] 1 2 1 2 1 2 2 2 2 1 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2
## [112] 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 1 2
## [149] 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 2
## [186] 2 2 1 2 2 2 2 1 2 1 1 2 1 2 2 2 1 1 2 1 1 2 1 1 2 2 2 2 2 2 2 1 1 2 1 1 2
## [223] 2 2 2 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 2 2 1 2 2 1 1 1 1 1 2 2 1 2 2 2 2 1 1
## [260] 1 2 1 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## [297] 1 2 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 1 1 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 1 1
## [334] 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 1 2 1 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 1
## [408] 1 1 2 2 1 2 2 2 1 2 1
# View the output that shows the number of clusters each method recommends.
nbclust_obj_nba$Best.nc
##                      KL       CH Hartigan     CCC    Scott  Marriot  TrCovW
## Number_clusters 15.0000   2.0000   3.0000  2.0000   3.0000      7.0  5.0000
## Value_Index     10.1954 426.1332  32.3573 36.5429 360.6084 316969.6 21.9309
##                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
## Number_clusters 3.0000    5.000 13.0000 10.0000 2.0000     2.0000 2.0000
## Value_Index     5.2615   21.692 -1.8739  0.2542 0.9356     0.4323 1.5255
##                 PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
## Number_clusters   2.0000  2.0000    6.0000  3.0000     2.0000 2.0000  2.0000
## Value_Index     -93.3563 -1.8088    0.2907 15.2946     0.6378 1.8066  0.4853
##                    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 13.0000      0  2.0000      0 15.0000
## Value_Index      0.0956      0  6.0473      0  0.1691
#Display the results visually 
freq_k_nba = nbclust_obj_nba$Best.nc[1,]
freq_k_nba = data.frame(freq_k_nba)
#View(freq_k_nba)

# Check the maximum number of clusters suggested.
max(freq_k_nba)
## [1] 15
#essentially resets the plot viewer back to default
#dev.off()

# Plot as a histogram.
ggplot(freq_k_nba,
       aes(x = freq_k_nba)) +
  geom_bar() +
  scale_x_continuous(breaks = seq(0, 15, by = 1)) +
  scale_y_continuous(breaks = seq(0, 12, by = 1)) +
  labs(x = "Number of Clusters",
       y = "Number of Votes",
       title = "Cluster Analysis")

#Using the recommended number of cluster compare the quality of the model 
#with 2 clusters 
# Both the elbow graph and the nbc Cluster method recommend two clusters.  
#Bonus: Create a 3d version of the output

#Add clusters to nba_final dataframe

nba_final$clusters <- (salary_clusters_nba)

# Use plotly to do a 3d imaging 

fig <- plot_ly(nba_final, 
               type = "scatter3d",
               mode="markers",
               symbol = ~clusters, symbols = c('circle', 'square'),
               x = ~PTS, 
               y = ~MP, 
               z = ~GS,
               color = ~Salary, 
               text = ~paste('Player:',Player,
                             "Position:",Pos,
                             "Team:", Tm))


fig